# Loading Libraries 

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(modelr)
library(nycflights13)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(splines)
library(forcats)
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.5
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Introduction and Description

The data is provided by data.gov.uk. It describes road accidents across Leeds in the year 2017. Between all datasets, this dataset was selected because it includes around 2,200 accident with 15 variable that can be easily parsed and explored. Also, we selected this dataset because we believe that there are different questions that can be asked about the data, and that the data can provide interesting answers for our questions. The dataset includes data such as location, date, time, road setting and casualty information for each accident that transpired during the year.

Reference : https://data.world/datagov-uk/6efe5505-941f-45bf-b576-4c1e09b579a1

Data Parsing and Import

# Loading & Parsing Data

Accidents_2017 <- read_csv("Data/datagov-uk-6efe5505-941f-45bf-b576-4c1e09b579a1/2017-8.csv", 
                           col_names = c("Reference_Number","Easting","Northing","Vehicles_Num","Accident_Date","Time","Road_Class","Road_Surface",
                                         "Lightning_Cond","Weather_Cond","Vehicle_Type","Casualty_Class","Severity","Gender","Age"), skip = 1)
## 
## -- Column specification --------------------------------------------------------
## cols(
##   Reference_Number = col_character(),
##   Easting = col_double(),
##   Northing = col_double(),
##   Vehicles_Num = col_double(),
##   Accident_Date = col_character(),
##   Time = col_character(),
##   Road_Class = col_character(),
##   Road_Surface = col_character(),
##   Lightning_Cond = col_character(),
##   Weather_Cond = col_character(),
##   Vehicle_Type = col_character(),
##   Casualty_Class = col_character(),
##   Severity = col_character(),
##   Gender = col_character(),
##   Age = col_double()
## )
Accidents_Date <- Accidents_2017$Accident_Date
Accidents_Date <- parse_date(Accidents_Date, "%m/%d/%Y")

Time <- Accidents_2017$Time
Time <- parse_time(Time, "%H%M")

Accidents_2017 <- Accidents_2017 %>%
  select(1:4, 7:15) %>%
  mutate(Accidents_Date, Time) 

Accidents_2017 <- Accidents_2017[,c(1:4,14,15,5:13)]

# Correcting spelling mistakes

Lightning_Cond <- 
  str_replace(Accidents_2017$Lightning_Cond,"Darkness: Street lights present and lit and lit","Darkness: Street lights present and lit")


Road_Class <- str_replace_all(Accidents_2017$Road_Class,
      c("A.*" = "A","B.*" = "B","M.*" = "M"))


Road_Surface <- str_replace(Accidents_2017$Road_Surface,"^F.*","Snow")


Weather_Cond <- word(Accidents_2017$Weather_Cond,1) 
Weather_Cond <- str_replace(Weather_Cond,"Fog","Other")


Vehicle_Type <- word(Accidents_2017$Vehicle_Type,1)
Vehicle_Type <- str_replace_all(Vehicle_Type,
      c(".Private" = "Taxi",
        "Ca.*" = "Car","Pedal" = "Cycle"))
Vehicle_Type <- str_replace(Vehicle_Type,"TaxiTaxi", "Taxi")


# Adding the improved columns to the dataset 

Accidents_2017 <- Accidents_2017 %>% 
  select(1:6,12:15) %>%
  mutate(Lightning_Cond,Weather_Cond,Road_Class,Road_Surface,Vehicle_Type,
         Year = year(Accidents_Date),
         Month = month(Accidents_Date),
         Day = day(Accidents_Date),
         Hour = hour(Time),
         Minute = minute(Time),
         Accidents_DateTime = make_datetime(Year, Month, Day, Hour, Minute))

Accidents_2017 <- Accidents_2017[,c(1:4, 16:20, 5:6, 21, 7:15)]

# Assigning the columns to their proper classes. 

for(i in 2:length(Accidents_2017)) {
    if(is.character(Accidents_2017[[i]])) {
        Accidents_2017[[i]] <- as.factor(Accidents_2017[[i]])
    } else if (is.numeric(Accidents_2017[[i]])) {
        Accidents_2017[[i]] <- as.numeric(Accidents_2017[[i]])
    }
}


map(Accidents_2017,class)
## $Reference_Number
## [1] "character"
## 
## $Easting
## [1] "numeric"
## 
## $Northing
## [1] "numeric"
## 
## $Vehicles_Num
## [1] "numeric"
## 
## $Year
## [1] "numeric"
## 
## $Month
## [1] "numeric"
## 
## $Day
## [1] "numeric"
## 
## $Hour
## [1] "numeric"
## 
## $Minute
## [1] "numeric"
## 
## $Accidents_Date
## [1] "Date"
## 
## $Time
## [1] "hms"      "difftime"
## 
## $Accidents_DateTime
## [1] "POSIXct" "POSIXt" 
## 
## $Casualty_Class
## [1] "factor"
## 
## $Severity
## [1] "factor"
## 
## $Gender
## [1] "factor"
## 
## $Age
## [1] "numeric"
## 
## $Lightning_Cond
## [1] "factor"
## 
## $Weather_Cond
## [1] "factor"
## 
## $Road_Class
## [1] "factor"
## 
## $Road_Surface
## [1] "factor"
## 
## $Vehicle_Type
## [1] "factor"
Accidents_2017
## # A tibble: 2,203 x 21
##    Reference_Number Easting Northing Vehicles_Num  Year Month   Day  Hour Minute
##    <chr>              <dbl>    <dbl>        <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1 3AP0313           426340   428455            1  2017     3    17     8     15
##  2 3BE0850           430828   433222            2  2017     1    14    13     30
##  3 4110858           428940   429856            2  2017     1     1     8      5
##  4 4110858           428940   429856            2  2017     1     1     8      5
##  5 4111495           429899   434277            2  2017     1     1    17      5
##  6 4111706           435946   436807            2  2017     1     1    12      0
##  7 4120471           443658   436768            3  2017     1     2    12     30
##  8 4120471           443658   436768            3  2017     1     2    12     30
##  9 4121054           442103   434572            2  2017     1     2    18      7
## 10 4121054           442103   434572            2  2017     1     2    18      7
## # ... with 2,193 more rows, and 12 more variables: Accidents_Date <date>,
## #   Time <time>, Accidents_DateTime <dttm>, Casualty_Class <fct>,
## #   Severity <fct>, Gender <fct>, Age <dbl>, Lightning_Cond <fct>,
## #   Weather_Cond <fct>, Road_Class <fct>, Road_Surface <fct>,
## #   Vehicle_Type <fct>
# Checking NAs 

sum(rowSums(is.na(Accidents_2017)))
## [1] 0
# No NA values to explore

Exploring Data Analysis

EDA1 <- Accidents_2017 %>%
  group_by(Gender, Age) %>%
  mutate(Count = n() ) %>%
  ungroup() %>%
  ggplot(mapping = aes(x=Age , y = Count)) +
  geom_point(alpha = 0.6) + 
  facet_wrap(~Gender) +
  ggtitle("Number of Casualties Per Age For Males and Females") +
  ylab("Number of Casualties") 

layout_plot <- function(my_plot, x = -0.057, y = - 0.033){
  my_plot[['x']][['layout']][['annotations']][[1]][['y']] <- x
  my_plot[['x']][['layout']][['annotations']][[2]][['x']] <- y
  my_plot
}

ggplotly(EDA1) %>% layout_plot
Accidents_2017 %>%
  group_by(Gender, Age) %>%
  summarise(Count = n() )
## `summarise()` has grouped output by 'Gender'. You can override using the `.groups` argument.
## # A tibble: 182 x 3
## # Groups:   Gender [2]
##    Gender   Age Count
##    <fct>  <dbl> <int>
##  1 Female     1     8
##  2 Female     2     3
##  3 Female     3     6
##  4 Female     4     2
##  5 Female     5     6
##  6 Female     6     7
##  7 Female     7     8
##  8 Female     8     5
##  9 Female     9     4
## 10 Female    10     9
## # ... with 172 more rows

The plot shows the relationship between the Number of Casualities and their Age. The graph is essential because it helps us identify which age group are more likely to be casualties. In addition, the plot is classified by the Gender, therefore; it shows how the relationship differs between males and females.

For females, as the age increases the count increases. The count peaks around the mid-20s, then the count declines gradually. The males’ graph has a similar shape as the females’. The only difference is that the males’ graph has a higher peak and the drop is sharper.

EDA2 <- Accidents_2017 %>%
  group_by(Casualty_Class, Age) %>%
  summarise(Count = n() ) %>%
  ungroup() %>%
  ggplot(mapping = aes(x=Age , y = Count)) +
  geom_point(alpha = 0.6) + 
  geom_smooth()+
  facet_wrap(~Casualty_Class) +
  ggtitle("Count against Age based on Casualty class") +
  ylab("Number of Casualties")
## `summarise()` has grouped output by 'Casualty_Class'. You can override using the `.groups` argument.
layout_plot <- function(my_plot, x = -0.057, y = - 0.033){
  my_plot[['x']][['layout']][['annotations']][[1]][['y']] <- x
  my_plot[['x']][['layout']][['annotations']][[2]][['x']] <- y
  my_plot
}

ggplotly(EDA2) %>% layout_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Accidents_2017 %>%
  group_by(Casualty_Class,Age) %>%
  summarise(Count = n() )
## `summarise()` has grouped output by 'Casualty_Class'. You can override using the `.groups` argument.
## # A tibble: 260 x 3
## # Groups:   Casualty_Class [3]
##    Casualty_Class    Age Count
##    <fct>           <dbl> <int>
##  1 Driver or rider     4     1
##  2 Driver or rider     6     2
##  3 Driver or rider     7     1
##  4 Driver or rider     8     3
##  5 Driver or rider     9     1
##  6 Driver or rider    10     3
##  7 Driver or rider    11     4
##  8 Driver or rider    12     5
##  9 Driver or rider    13     5
## 10 Driver or rider    14     3
## # ... with 250 more rows
Accidents_2017 %>%
  group_by(Casualty_Class) %>%
  summarise(Count = n() )
## # A tibble: 3 x 2
##   Casualty_Class               Count
##   <fct>                        <int>
## 1 Driver or rider               1296
## 2 Pedestrian                     321
## 3 Vehicle or pillion passenger   586

The plot shows the relationship between the Number of Casualities and the Age for each Casualty class. This plot is very important as it helps understand which type of casualty each age group is likely to be in a road accidents which we can use to raise awareness for different age groups separately.

When the casualty is a driver or rider, as the age increase till the mid-30s the count rockets,then the count falls rapidly. The plots for pedestrians and passengers are very similar,the only disparity is that the passengers have a higher peak. It should be highlighted that the number of casualties between the age 20 till mid-50s is awfully high when the casualty is a driver or rider compared to other casualty classes.

Sections Answering Questions

Why does the number of accidents increase towards the end of the year ?

daily <- Accidents_2017 %>%
  group_by(Accidents_Date) %>%
  summarise(n = n()) 

daily
## # A tibble: 360 x 2
##    Accidents_Date     n
##    <date>         <int>
##  1 2017-01-01         4
##  2 2017-01-02         7
##  3 2017-01-03         3
##  4 2017-01-04         7
##  5 2017-01-05         4
##  6 2017-01-06         5
##  7 2017-01-07         5
##  8 2017-01-09         5
##  9 2017-01-10         9
## 10 2017-01-11         7
## # ... with 350 more rows
ggplot(daily, mapping = aes(Accidents_Date, n)) +
  geom_line()+
  geom_smooth()+
  ggtitle("Number of accidents against Date")+
  ylab("Number of accidents")+
  xlab("Date")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# From the plot, we notice that from January till April the number of accidents per day keep decreasing. From April till around August the number of accidents per day stays constant. From August till the end of the year the number generally increases. Why is the number of accidents high at the start and end of the year?. 

# Let's make a model to boarden our understanding. We will assume that number of accidents has a linear relationship with the variable Accident_Date.

mod <- lm(n ~ Accidents_Date,data = daily)

grid <- daily %>%
  data_grid(Accidents_Date) %>%
  add_predictions(mod)

ggplot(daily,aes(Accidents_Date,n))+
  geom_line()+
  geom_point(aes(y = pred),data = grid,colour = "red")+
   ggtitle("Number of accidents against Date with predictions")+
  ylab("Number of accidents")+
  xlab("Date")

daily <- daily %>%
  add_residuals(mod)

Resid_plot1 <- ggplot(daily,aes(Accidents_Date,resid))+
  geom_point()+
  geom_ref_line(h = 0) +
  geom_smooth()+
  xlab("Date")+
  ylab("Residuals")+
  ggtitle("Residuals against Date")

ggplotly(Resid_plot1) 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# The model is not very good,it underestimates the January and fourth quarter of the year. 

# Let's see the number of accidents per month and visualize it 

Accidents_by_month <- 
    Accidents_2017 %>%
    count(Month)
    


Accidents_by_month
## # A tibble: 12 x 2
##    Month     n
##    <dbl> <int>
##  1     1   199
##  2     2   157
##  3     3   174
##  4     4   173
##  5     5   162
##  6     6   167
##  7     7   177
##  8     8   164
##  9     9   182
## 10    10   223
## 11    11   234
## 12    12   191
# Plot of Accidents per month
ggplot(data = Accidents_by_month, mapping = aes(x = Month, y = n)) +
  geom_bar(aes(fill = Month) , show.legend = F, stat = "identity") +
  ggtitle("Number of Accidents Per Month") +
  xlab("Month") +
  ylab("Number of Accidents")

# From the bar chart we understand why our model underestimates the stated months because the actual number of accidents for these months is unusually high.


# It is important to note that in Leeds, during January, October, November and December the climate is different to the rest of the year. The weather is very cloudy, the temperature drops and the number of daylight hours are low compared to the rest of the years. Could this be the reason to the high count?. Let's explore the data for these months.

Months <- Accidents_2017%>%
  filter( Month %in% c(1,9,10,11,12))
Months
## # A tibble: 1,029 x 21
##    Reference_Number Easting Northing Vehicles_Num  Year Month   Day  Hour Minute
##    <chr>              <dbl>    <dbl>        <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1 3BE0850           430828   433222            2  2017     1    14    13     30
##  2 4110858           428940   429856            2  2017     1     1     8      5
##  3 4110858           428940   429856            2  2017     1     1     8      5
##  4 4111495           429899   434277            2  2017     1     1    17      5
##  5 4111706           435946   436807            2  2017     1     1    12      0
##  6 4120471           443658   436768            3  2017     1     2    12     30
##  7 4120471           443658   436768            3  2017     1     2    12     30
##  8 4121054           442103   434572            2  2017     1     2    18      7
##  9 4121054           442103   434572            2  2017     1     2    18      7
## 10 4121054           442103   434572            2  2017     1     2    18      7
## # ... with 1,019 more rows, and 12 more variables: Accidents_Date <date>,
## #   Time <time>, Accidents_DateTime <dttm>, Casualty_Class <fct>,
## #   Severity <fct>, Gender <fct>, Age <dbl>, Lightning_Cond <fct>,
## #   Weather_Cond <fct>, Road_Class <fct>, Road_Surface <fct>,
## #   Vehicle_Type <fct>
Accidents_2017%>%
  count(Lightning_Cond)
## # A tibble: 5 x 2
##   Lightning_Cond                                n
##   <fct>                                     <int>
## 1 Darkness: No street lighting                 46
## 2 Darkness: Street lighting unknown           432
## 3 Darkness: Street lights present and lit     535
## 4 Darkness: Street lights present but unlit     9
## 5 Daylight: Street lights present            1181
Months %>%
  count(Lightning_Cond)
## # A tibble: 5 x 2
##   Lightning_Cond                                n
##   <fct>                                     <int>
## 1 Darkness: No street lighting                 27
## 2 Darkness: Street lighting unknown           130
## 3 Darkness: Street lights present and lit     364
## 4 Darkness: Street lights present but unlit     2
## 5 Daylight: Street lights present             506
Accidents_2017%>%
  count(Road_Surface)
## # A tibble: 3 x 2
##   Road_Surface     n
##   <fct>        <int>
## 1 Dry           1647
## 2 Snow            35
## 3 Wet/Damp       521
Months %>%
  count(Road_Surface)
## # A tibble: 3 x 2
##   Road_Surface     n
##   <fct>        <int>
## 1 Dry            687
## 2 Snow            34
## 3 Wet/Damp       308
Accidents_2017%>%
  count(Weather_Cond)
## # A tibble: 4 x 2
##   Weather_Cond     n
##   <fct>        <int>
## 1 Fine          1972
## 2 Other           21
## 3 Raining        202
## 4 Snowing          8
Months %>%
  count(Weather_Cond)
## # A tibble: 4 x 2
##   Weather_Cond     n
##   <fct>        <int>
## 1 Fine           936
## 2 Other           11
## 3 Raining         75
## 4 Snowing          7
# From our data, we see that during January, October, November and December there is a 4.4% increase in accidents in darkness, 8% increase in accidents when the road surface is wet or snow compared to the rest of the year. There was no relevant change in Weather conditions so we will ignore it for now.

# Let's try to model the relationship between road surface and lightning conditions and the count for the months of the year.

Climate <- Accidents_2017 %>%
  count(Accidents_Date,Month,Road_Surface,Lightning_Cond)


mod <- lm(n ~ Road_Surface + Lightning_Cond ,data = Climate)

grid <- Climate %>%
  data_grid(Month,Road_Surface,Lightning_Cond) %>%
  add_predictions(mod)

ggplot(Climate,aes(Month,n))+
  geom_point()+
  geom_point(data = grid,aes(y = pred),colour = "red")+
  ggtitle("Number of accidents against Month with predictions")+
  ylab("Number of accidents") +
  xlab("Month")

Climate <- Climate %>%
  add_residuals(mod)

Climate
## # A tibble: 953 x 6
##    Accidents_Date Month Road_Surface Lightning_Cond                     n  resid
##    <date>         <dbl> <fct>        <fct>                          <int>  <dbl>
##  1 2017-01-01         1 Dry          Daylight: Street lights prese~     1 -1.87 
##  2 2017-01-01         1 Wet/Damp     Darkness: Street lights prese~     1 -0.721
##  3 2017-01-01         1 Wet/Damp     Daylight: Street lights prese~     2 -0.376
##  4 2017-01-02         1 Dry          Darkness: No street lighting       3  0.911
##  5 2017-01-02         1 Dry          Darkness: Street lights prese~     1 -1.22 
##  6 2017-01-02         1 Dry          Daylight: Street lights prese~     2 -0.875
##  7 2017-01-02         1 Wet/Damp     Darkness: Street lights prese~     1 -0.721
##  8 2017-01-03         1 Dry          Darkness: Street lights prese~     2 -0.220
##  9 2017-01-03         1 Dry          Daylight: Street lights prese~     1 -1.87 
## 10 2017-01-04         1 Dry          Darkness: Street lights prese~     3  0.780
## # ... with 943 more rows
Resid_plot2 <- ggplot(Climate,aes(Month,resid)) +
  geom_hex()+
  geom_ref_line(h = 0)+
  geom_smooth()+
  ggtitle("Residuals against Month")+
  ylab("Residuals")+
  xlab("Month")

ggplotly(Resid_plot2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#  Although there are some months with huge outliers the model can accurately predict for the month of January and for the fourth quarter of the year. So lets use the residuals on the Accident_Date variable to remove the pattern. 


Resid_plot3 <- ggplot(Climate,aes(Accidents_Date,resid))+
  geom_hex()+
  geom_ref_line(h = 0) +
  geom_smooth()+
   ggtitle("Residuals against Date")+
  ylab("Residuals")+
  xlab("Date")

ggplotly(Resid_plot3)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Now we can look at the residuals, which verifies that we’ve substantially  decreased the linear pattern.

Through the extensive analysis that we did, we found that the number of casualties increases towards the end of the year because these months have an awful climate which makes it harder to drive. The awful Climate increases the percentage of accidents in the dark and percentage of accidents when the surface of the road is either snowy or wet which leads to higher accidents in these months. When we use a model to remove the effect caused by the climate and visualize the relationship between residuals and Accidents_dates we get a good plot. Although the residuals seem a bit biased we were able to decrease the distance between the highest and lowest residual and we got a better smooth line that has a lower confidence interval which proves we are on the right track.

Does the day of the week affect the time at which accidents happen ?

# Let's started by counting the total number of accidents for each hour and visualizing it with ggplot2


accident_hour <- Accidents_2017 %>%
  count(Hour) 

accident_hour
## # A tibble: 24 x 2
##     Hour     n
##    <dbl> <int>
##  1     0    30
##  2     1    23
##  3     2     9
##  4     3    15
##  5     4    16
##  6     5    12
##  7     6    41
##  8     7    85
##  9     8   143
## 10     9   105
## # ... with 14 more rows
ggplot(accident_hour,aes(Hour,n))+
  geom_line()+
  geom_point()+
  ggtitle("Number of accidents against Hour")+
  ylab("Number of accidents")+
  xlab("Hour")

# Let's try to model the relationship between hours and count.

mod1 <- lm(n ~ ns(Hour,1) ,data = accident_hour)
mod2 <- lm(n ~ ns(Hour,2) ,data = accident_hour)
mod3 <- lm(n ~ ns(Hour,3) ,data = accident_hour)
mod4 <- lm(n ~ ns(Hour,4) ,data = accident_hour)
mod5 <- lm(n ~ ns(Hour,5) ,data = accident_hour)
mod6 <- lm(n ~ ns(Hour,6) ,data = accident_hour)

grid <- accident_hour %>%
  data_grid(Hour = seq_range(Hour,n = 50,expand = 0.1)) %>%
  gather_predictions(mod1,mod2,mod3,mod4,mod5,mod6,.pred = "Y")
grid
## # A tibble: 300 x 3
##    model   Hour     Y
##    <chr>  <dbl> <dbl>
##  1 mod1  -1.15   34.6
##  2 mod1  -0.634  37.0
##  3 mod1  -0.117  39.3
##  4 mod1   0.399  41.6
##  5 mod1   0.915  44.0
##  6 mod1   1.43   46.3
##  7 mod1   1.95   48.6
##  8 mod1   2.46   51.0
##  9 mod1   2.98   53.3
## 10 mod1   3.50   55.6
## # ... with 290 more rows
ggplot(accident_hour,aes(Hour,n))+
  geom_point()+
  geom_smooth(data = grid,aes(y = Y),colour = "red")+
  facet_wrap(~ model)+
  ggtitle("Number of accidents against Hour with predictions")+
  ylab("Number of accidents")+
  xlab("Hour")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

accident_hour <- accident_hour %>%
  gather_residuals(mod1,mod2,mod3,mod4,mod5,mod6)

Resid_plot4 <- ggplot(accident_hour,aes(Hour,resid)) +
  geom_point()+ 
  geom_ref_line(h = 0)+ 
  geom_smooth()+
  facet_wrap(~model)+
  ggtitle("Residuals against Hour")+
  ylab("Residuals")+
  xlab("Hour")


ggplotly(Resid_plot4) %>% layout_plot
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# It seems that as we increase the degrees of freedom, the better our model gets. We will stick with 6 degrees of freedom for the rest of the analysis.


# Now we will add a day of the week variable (dow) and visualize the relationship between count and hour for each day of the week.

dow_hour <- Accidents_2017 %>%
  mutate(dow = wday(Accidents_DateTime,label =TRUE))%>%
  count(dow,Hour) 

dow_hour
## # A tibble: 158 x 3
##    dow    Hour     n
##    <ord> <dbl> <int>
##  1 Sun       0    14
##  2 Sun       1     6
##  3 Sun       2     2
##  4 Sun       3     2
##  5 Sun       4     6
##  6 Sun       6     5
##  7 Sun       7     1
##  8 Sun       8     8
##  9 Sun       9     6
## 10 Sun      10     5
## # ... with 148 more rows
ggplot(dow_hour,aes(Hour,n,colour = dow))+
  geom_line()+
  geom_point() +
  labs(col = "Days of the Week")+
  ggtitle("Number of accidents against Hour based on day of the week")+
  ylab("Number of accidents")+
  xlab("Hour")

# From the graph we notice that Sunday and Saturday have a different shape from the rest of the days. Both have a low count between 6 and 9 am  compared to other days,  Saturday  peaks at an earlier time compared to other days. Moreover, Sunday has a lower count between 4 and 7 pm. The rest of the days have a similar shapes and follow the same trends. I think this is because the Sunday and Saturday are weekend days.



#Lets use a model to understand the relationship better. 

mod <- lm(n ~ ns(Hour,6) ,data = dow_hour)
mod1 <- lm(n ~ ns(Hour,6) + dow,data = dow_hour)
mod2 <- lm(n ~ ns(Hour,6) * dow,data = dow_hour)

grid <- dow_hour %>%
  data_grid(Hour = seq_range(Hour,n = 50,expand = 0.1),dow) %>%
  gather_predictions(hour_alone = mod,plus_dow = mod1,multi_dow = mod2,.pred = "Y")
grid
## # A tibble: 1,050 x 4
##    model        Hour dow       Y
##    <chr>       <dbl> <ord> <dbl>
##  1 hour_alone -1.15  Sun    8.12
##  2 hour_alone -1.15  Mon    8.12
##  3 hour_alone -1.15  Tue    8.12
##  4 hour_alone -1.15  Wed    8.12
##  5 hour_alone -1.15  Thu    8.12
##  6 hour_alone -1.15  Fri    8.12
##  7 hour_alone -1.15  Sat    8.12
##  8 hour_alone -0.634 Sun    6.89
##  9 hour_alone -0.634 Mon    6.89
## 10 hour_alone -0.634 Tue    6.89
## # ... with 1,040 more rows
ggplot(dow_hour,aes(Hour,n))+
  geom_point()+
  geom_smooth(data = grid,aes(y = Y),colour = "red")+
  facet_wrap(~ model)+
  ggtitle("Number of accidents against Hour with prediction based on model")+
  ylab("Number of accidents")+
  xlab("Hour")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

dow_hour <- dow_hour %>%
  gather_residuals(hour_alone = mod,plus_dow = mod1,multi_dow = mod2)

Resid_plot5 <- ggplot(dow_hour,aes(Hour,resid)) +
  geom_point()+ geom_ref_line(h = 0)+ geom_smooth()+
  facet_wrap(~model)+
   ggtitle("Residuals against Hour based on model")+
  ylab("Residuals")+
  xlab("Hour")

ggplotly(Resid_plot5) %>% layout_plot()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# It looks like mod2 (Hour * day of the week) gives us the plot with the best residuals. Only 5 points from 168 points are outside a distance of 10 residuals.


dow_hour %>%
  filter(model == "multi_dow") %>%
ggplot(aes(Hour,resid,colour = dow))+
   geom_ref_line(h = 0) + 
  geom_point() +
  labs(col = "Days of the Week")+
   ggtitle("Residuals against Hour based on model 2")+
  ylab("Residuals")+
  xlab("Hour")

After the broad analysis we come to the conclusion that the day of the week affects the time at which accidents happen. On workdays, we notice that most accidents happen between 6 and 9 am and between 4 and 7 pm while on weekends they can vary. When we tried to model the relationship between day of the week and the hour at which accidents happen we found that Hour multiplied by day of the week gives us the best plot.We still have a few outliners which could suggest that there is another variable that impacts the time at which accidents happen. Lastly, we found as we increase the degrees of freedom we we’re able to model the relationship between Hour and number of accidents better.

Closing Discussion

From the EDA, these are the most interesting findings :

Firstly, The number of accidents per day for each climate is 2.97 when it is raining, 3.3 when it is snowing and 9.2 when the climate is dry. Secondly, most fatal accidents occur between 4 pm and 8 pm and fatal accidents tend to happen to people under the age of 25 more frequently. Thirdly, the number of infants and children casualties is dreadfully high when they are passengers in comparison to other casualty classes. Lastly, after 6 am the number of accidents keeps increasing till 5.30 pm.

From the model building, these are the most interesting findings :

We found as we increase the degrees of freedom we we’re able to model the relationship between Hour and number of accidents better. Day of the week affects the hour at which accidents happen but there are other hidden variables that play a role. Climate is the main reason for the increase in the number of accidents towards the end of the year.

Future work we would like to do :

Firstly, we want to explore why do most accidents happen when light is available. The question is thought-provoking and could lead to interesting conclusions. Secondly, after seeing how the climate influences road accidents we want to analyze how the seasons impact the number of road accidents. Thirdly, we want to build a model including all the variables that effect driving conditions to find which variable heavily influences the count of accidents.